home *** CD-ROM | disk | FTP | other *** search
Text File | 2000-10-06 | 15.9 KB | 706 lines | [TEXT/CWIE] |
- ///--------------------------------------------------------------------------------------
- // BlitPixieDoubleRects
- //
- // a blitter to merge two offscreen areas into one onscreen area
- //
- // Written by Anders F Björklund <afb@algonet.se>, Nov. 1999
- // ©1997-99 afb. All rights reserved.
- ///--------------------------------------------------------------------------------------
-
- #ifndef __BLITPIXIE__
- #include "BlitPixieHeader.h"
- #endif
-
- #include "BlitPixieAsm.h"
-
- #pragma mark *** PowerPC asm:
- #if USE_PPC_ASSEMBLY
-
- // NOTE: This implementation _always_ blits aligned doubles to the screen
- // by using a temporary buffer if needed (to avoid alignment exceptions)
-
- // NOTE: Assumes rows > 0
- // Aligning code assumes that rowBytes is multiple of 4
-
-
- // Note: the following definitions avoid a bug in CodeWarrior Pro 5's assembler,
- // where bnl instructions are mistakenly compiled to bgel, instead of bnl or bge.
- // Geez Metrowerks, be more careful next time!
-
- #define IF_SET 0x0C // constant for BO field
- #define IF_NOT 0x04 // constant for BO field
-
- ASM_FUNC void BlitPixieDoubleRects( // PowerPC version: for every row, copy rect A and then rect B from src to dst
- register unsigned char *src, // r3: source pointer, start of rect A
- register unsigned char *dst, // r4: destination pointer, start of rect A
- register unsigned short bytesA, // r5: bytes per row in rect A
- register unsigned short bytesB, // r6: bytes per row in rect B
- register unsigned short rows, // r7: row count; the row loop is bottom-tested, so 0 is not handled
- register BlitPixieOffsetInfoPtr info) // r8: four 32-bit offsets, read below at byte offsets 0, 4, 8 and 12
- {
- #define r_temp r31 // scratch during setup
- #define r_y r31 // recycled: rows remaining, once the row loop starts
- #define r_alignA r30 // bytes copied first so that dst of rect A reaches an 8-byte boundary
- #define r_alignB r29 // same, for rect B
- #define r_leftdoubles r30 // recycled: whole 8-byte doubles in the rect A tail (aligned path only)
- #define r_rightdoubles r29 // recycled: whole 8-byte doubles in the rect B tail (aligned path only)
-
- #define r_srcOffsetAtoB r28 // added to src between rect A and rect B
- #define r_srcOffsetBtoA r27 // added to src between rect B and the next row's rect A
- #define r_dstOffsetAtoB r26 // added to dst between rect A and rect B
- #define r_dstOffsetBtoA r25 // added to dst between rect B and the next row's rect A
-
- #define r_leftblocks r24 // rect A: number of 32-byte blocks after the alignment bytes
- #define r_left r23 // rect A: bytes left over after the 32-byte blocks (0-31)
- #define r_rightblocks r22 // rect B: number of 32-byte blocks after the alignment bytes
- #define r_right r21 // rect B: bytes left over after the 32-byte blocks (0-31)
-
- #define r_buffer r20 // 32-byte aligned scratch buffer on the stack
- #define r_offset r19 // holds the constant 32 (index register for lswx/stswx)
-
- #define kRegisterSaveStack (13 * 4) // # registers to save * sizeof(register)
-
- ASM_BEGIN
- stmw r19,-kRegisterSaveStack(SP) // save r19-r31 on stack (in the "red zone")
-
- // PARAMETERS:
- #define r_src r3
- #define r_dst r4
- #define r_bytesA r5
- #define r_bytesB r6
- #define r_rows r7
- #define r_info r8
-
- // srcOffsetAtoB = info->srcOffsetAtoB;
- // srcOffsetBtoA = info->srcOffsetBtoA;
- // dstOffsetAtoB = info->dstOffsetAtoB;
- // dstOffsetBtoA = info->dstOffsetBtoA;
- lwz r_srcOffsetAtoB,0(info)
- lwz r_srcOffsetBtoA,4(info)
- lwz r_dstOffsetAtoB,8(info)
- lwz r_dstOffsetBtoA,12(info)
-
- // get a cache-block (32 byte) aligned stack storage for the (32 byte) buffer
- addi r_buffer,SP,-(kRegisterSaveStack + 32) // start 32 bytes below the register save area
- rlwinm r_buffer,r_buffer,0,0,26 // clear the low 5 bits: round down to a 32-byte boundary
- dcbz r0,r_buffer // zero the buffer's cache block (r0 in the base position means 0, not the contents of r0)
-
- // alignment offset for rect A: alignA = min((-dst) & 7, bytesA)
- neg r_alignA,r4
- rlwinm r_alignA,r_alignA,0,29,31
- cmplw r_alignA,r_bytesA
- ble @alignAok
- mr r_alignA,r_bytesA
- @alignAok:
-
- // compute source alignment for A: r9 = (-src) & 3
- neg r9,r3
- rlwinm r9,r9,0,30,31
-
- // alignment offset for rect B: alignB = min((-(dst + bytesA + dstOffsetAtoB)) & 7, bytesB)
- add r0,r_bytesA,r_dstOffsetAtoB
- add r_alignB,r4,r0
- neg r_alignB,r_alignB
- rlwinm r_alignB,r_alignB,0,29,31
- cmplw r_alignB,r_bytesB
- ble @alignBok
- mr r_alignB,r_bytesB
- @alignBok:
-
- // compute source alignment for B: r10 = (-(src + bytesA + srcOffsetAtoB)) & 3
- add r0,r_bytesA,r_srcOffsetAtoB
- add r10,r3,r0
- neg r10,r10
- rlwinm r10,r10,0,30,31
-
- sub r_bytesA,r_bytesA,r_alignA // from here on bytesA/bytesB exclude the alignment bytes
- sub r_bytesB,r_bytesB,r_alignB
-
- // ———————————————————————————————————————————————————————————————————————————
-
- //pre-calculate transfer sizes
- rlwinm r_leftblocks,r_bytesA,32-5,5,31 // leftblocks = bytesA / 32;
- rlwinm r_left,r_bytesA,0,27,31 // left = bytesA % 32;
- rlwinm r_rightblocks,r_bytesB,32-5,5,31 // rightblocks = bytesB / 32;
- rlwinm r_right,r_bytesB,0,27,31 // right = bytesB % 32;
-
- mfcr r0 // save the CR in r0
- // NOTE : don't use r0 from here and below (except as zero)
-
- // NOTE: condition register usage below: (• = unused)
- // lt gt eq so
- // cr0 <-------------rows (y)---------------------> (during setup: does rect A's source alignment match?)
- // cr1 • • • • (during setup: does rect B's source alignment match?)
- // cr2 A aligned ? <-------alignA & 7------------->
- // cr3 <------leftblocks compared with 0----------> (eq = no 32-byte blocks in rect A)
- // cr4 left >= 8 <--------left & 7--------------> (or just left compared with 0, on the unaligned path)
- // cr5 B aligned ? <-------alignB & 7------------->
- // cr6 <------rightblocks compared with 0---------> (eq = no 32-byte blocks in rect B)
- // cr7 right >= 8 <--------right & 7-------------> (or just right compared with 0, on the unaligned path)
-
- rlwinm r_alignA,r_alignA,5*4,9,11 // rotate the low 3 bits of alignA into the gt/eq/so positions of cr2
- rlwinm r_alignB,r_alignB,2*4,21,23 // rotate the low 3 bits of alignB into the gt/eq/so positions of cr5
-
- mtcrf 1<<(7-2),r_alignA // cr2 = alignA & 7
- mtcrf 1<<(7-5),r_alignB // cr5 = alignB & 7
-
- rlwinm r_alignA,r_alignA,32-5*4,29,31 // shift back to normal again
- rlwinm r_alignB,r_alignB,32-2*4,29,31
-
- lwz r_temp,gBlitPixieProcessorType(RTOC) // load a pointer through RTOC ...
- lha r_temp,0(r_temp) // ... and the 16-bit value it points to
- cmplwi r_temp,k601 // can we blit unaligned doubles ? (i.e. on a PPC 601)
- bne @cantBlitUnaligned
-
- @canBlitUnaligned: // force both alignment tests below (bne cr0 / bne cr1) to pass
- creqv 0*4+2,0*4+2,0*4+2 // crset cr0_EQ
- creqv 1*4+2,1*4+2,1*4+2 // crset cr1_EQ
- b @endcantBlitUnaligned
-
- @cantBlitUnaligned:
- rlwinm r_temp,r_alignA,0,30,31 // alignA & 3
- cmplwi cr4,r_left,0 // cr4 = left > 0 (for unaligned)
- cmplw cr0,r_temp,r9 // cr0_EQ if (alignA & 3) equals the source alignment of rect A
- rlwinm r_temp,r_alignB,0,30,31 // alignB & 3
- cmplwi cr7,r_right,0 // cr7 = right > 0 (for unaligned)
- cmplw cr1,r_temp,r10 // cr1_EQ if (alignB & 3) equals the source alignment of rect B
- @endcantBlitUnaligned:
-
- cmplwi cr3,r_leftblocks,0 // cr3_EQ = rect A has no 32-byte blocks
- cmplwi cr6,r_rightblocks,0 // cr6_EQ = rect B has no 32-byte blocks
-
- bne cr0,@skipLeftAlign // source alignment differs: rect A keeps the buffered (not aligned) path
- rlwinm. r_leftdoubles,r_left,32-3,30,31 // leftdoubles = left / 8; the record form also sets cr0
- rlwinm r_temp,r_left,3*4,17,19 // rotate left & 7 into the gt/eq/so positions of cr4
- creqv 2*4+0,2*4+0,2*4+0 // cr2_LT = is left aligned ? = true
- mtcrf 1<<(7-4),r_temp // cr4 = left & 7 (for aligned)
- crnor 4*4+0,0*4+2,0*4+2 // cr4_LT = !cr0_EQ, i.e. leftdoubles != 0 (for aligned)
- @skipLeftAlign:
-
- bne cr1,@skipRightAlign // source alignment differs: rect B keeps the buffered (not aligned) path
- rlwinm. r_rightdoubles,r_right,32-3,30,31 // rightdoubles = right / 8; the record form also sets cr0
- rlwinm r_temp,r_right,0*4,29,31 // right & 7 is already in the gt/eq/so positions of cr7
- creqv 5*4+0,5*4+0,5*4+0 // cr5_LT = is right aligned ? = true
- mtcrf 1<<(7-7),r_temp // cr7 = right & 7 (for aligned)
- crnor 7*4+0,0*4+2,0*4+2 // cr7_LT = !cr0_EQ, i.e. rightdoubles != 0 (for aligned)
- @skipRightAlign:
-
- mr r_y,r_rows // r_y shares r31 with r_temp, which is not used past this point
-
- li r_offset,32
- subi r3,r3,32 // bias src and dst by -32: every access below uses a +32 displacement (or r_offset),
- subi r4,r4,32 // so the update forms (lwzu/lfdu/stfdu) can pre-increment by one 32-byte block
-
- // NOTE: from here on, regs r5-r12 are scratch!
- @rowloop:
- // ——————————————————————————————————————————————————————————————————————————— RECT A (left)
-
- // align destination to 8-byte boundary
- bc IF_NOT,2*4+3, @skipalignAByte // cr2_SO = alignA & 1
- lbz r5,32(r3)
- addi r3,r3,1
- stb r5,32(r4)
- addi r4,r4,1
- @skipalignAByte:
- bc IF_NOT,2*4+2, @skipalignAWord // cr2_EQ = alignA & 2
- lhz r5,32(r3)
- addi r3,r3,2
- sth r5,32(r4)
- addi r4,r4,2
- @skipalignAWord:
- bc IF_NOT,2*4+1, @skipalignALong // cr2_GT = alignA & 4
- lwz r5,32(r3)
- addi r3,r3,4
- stw r5,32(r4)
- addi r4,r4,4
- @skipalignALong:
-
- bc IF_SET,2*4+0, @leftAligned // cr2_LT was set during setup if the aligned path can be used
-
- // --------------------------------- NOT ALIGNED
- @leftNotAligned:
-
- mtctr r_leftblocks
- mtxer r_left // XER = left: byte count for the lswx/stswx below
-
- // copy 32 byte blocks (w/ buffer)
- beq cr3,@skipleft // no 32-byte blocks in rect A
- @leftloop:
- lwzu r5,32(r3) // load 8 words from src (advancing r3 by 32) ...
- lwz r6,4(r3)
- lwz r7,8(r3)
- lwz r8,12(r3)
- lwz r9,16(r3)
- lwz r10,20(r3)
- lwz r11,24(r3)
- lwz r12,28(r3)
- stw r5,0(r_buffer) // ... store them to the aligned buffer ...
- stw r6,4(r_buffer)
- stw r7,8(r_buffer)
- stw r8,12(r_buffer)
- stw r9,16(r_buffer)
- stw r10,20(r_buffer)
- stw r11,24(r_buffer)
- stw r12,28(r_buffer)
- lfd fp1,0(r_buffer) // ... reload them as 4 doubles ...
- lfd fp2,8(r_buffer)
- lfd fp3,16(r_buffer)
- lfd fp4,24(r_buffer)
- stfdu fp1,32(r4) // ... and store the doubles to dst (advancing r4 by 32)
- stfd fp2,8(r4)
- stfd fp3,16(r4)
- stfd fp4,24(r4)
- bdnz @leftloop
- @skipleft:
-
- // copy left-over bytes (<32)
- beq cr4,@endleft // on this path cr4 holds the comparison of left with 0
- lswx r5,r_offset,r3 // load "left" bytes from r3+32 into r5 and up
- add r3,r3,r_left
- stswx r5,r_offset,r4 // store them to r4+32
- add r4,r4,r_left
- b @endleft
-
- // --------------------------------- ALIGNED
-
- @leftAligned: // copy 32 byte blocks (wo/ buffer)
-
- beq cr3,@skipleftAligned // no 32-byte blocks in rect A
- mtctr r_leftblocks
- @leftloopAligned:
- lfdu fp1,32(r3)
- lfd fp2,8(r3)
- lfd fp3,16(r3)
- lfd fp4,24(r3)
- stfdu fp1,32(r4)
- stfd fp2,8(r4)
- stfd fp3,16(r4)
- stfd fp4,24(r4)
- bdnz @leftloopAligned
- @skipleftAligned:
-
- // copy left-over bytes (<32)
- bc IF_NOT,4*4+0, @skipADoubles // cr4_LT = at least one whole double left
- mtctr r_leftdoubles
- @leftloopDouble:
- lfd fp0,32(r3)
- addi r3,r3,8
- stfd fp0,32(r4)
- addi r4,r4,8
- bdnz @leftloopDouble
- @skipADoubles:
- bc IF_NOT,4*4+1, @skipALong // cr4_GT = left & 4
- lwz r5,32(r3)
- addi r3,r3,4
- stw r5,32(r4)
- addi r4,r4,4
- @skipALong:
- bc IF_NOT,4*4+2, @skipAWord // cr4_EQ = left & 2
- lhz r5,32(r3)
- addi r3,r3,2
- sth r5,32(r4)
- addi r4,r4,2
- @skipAWord:
- bc IF_NOT,4*4+3, @skipAByte // cr4_SO = left & 1
- lbz r5,32(r3)
- addi r3,r3,1
- stb r5,32(r4)
- addi r4,r4,1
- @skipAByte:
-
- @endleft:
- add r3,r3,r_srcOffsetAtoB // step from the end of rect A to the start of rect B
- add r4,r4,r_dstOffsetAtoB
-
- // ——————————————————————————————————————————————————————————————————————————— RECT B (right)
-
- // align destination to 8-byte boundary
- bc IF_NOT,5*4+3, @skipalignBByte // cr5_SO = alignB & 1
- lbz r5,32(r3)
- addi r3,r3,1
- stb r5,32(r4)
- addi r4,r4,1
- @skipalignBByte:
- bc IF_NOT,5*4+2, @skipalignBWord // cr5_EQ = alignB & 2
- lhz r5,32(r3)
- addi r3,r3,2
- sth r5,32(r4)
- addi r4,r4,2
- @skipalignBWord:
- bc IF_NOT,5*4+1, @skipalignBLong // cr5_GT = alignB & 4
- lwz r5,32(r3)
- addi r3,r3,4
- stw r5,32(r4)
- addi r4,r4,4
- @skipalignBLong:
-
- bc IF_SET,5*4+0, @rightAligned // cr5_LT was set during setup if the aligned path can be used
-
- // --------------------------------- UNALIGNED
- @rightNotAligned:
-
- mtctr r_rightblocks
- mtxer r_right // XER = right: byte count for the lswx/stswx below
-
- // copy 32 byte blocks (w/ buffer)
- beq cr6,@skipright // no 32-byte blocks in rect B
- @rightloop:
- lwzu r5,32(r3) // same word-load / buffer / double-store sequence as for rect A
- lwz r6,4(r3)
- lwz r7,8(r3)
- lwz r8,12(r3)
- lwz r9,16(r3)
- lwz r10,20(r3)
- lwz r11,24(r3)
- lwz r12,28(r3)
- stw r5,0(r_buffer)
- stw r6,4(r_buffer)
- stw r7,8(r_buffer)
- stw r8,12(r_buffer)
- stw r9,16(r_buffer)
- stw r10,20(r_buffer)
- stw r11,24(r_buffer)
- stw r12,28(r_buffer)
- lfd fp1,0(r_buffer)
- lfd fp2,8(r_buffer)
- lfd fp3,16(r_buffer)
- lfd fp4,24(r_buffer)
- stfdu fp1,32(r4)
- stfd fp2,8(r4)
- stfd fp3,16(r4)
- stfd fp4,24(r4)
- bdnz @rightloop
- @skipright:
-
- // copy left-over bytes (<32)
- beq cr7,@endright // on this path cr7 holds the comparison of right with 0
- lswx r5,r_offset,r3 // load "right" bytes from r3+32 into r5 and up
- add r3,r3,r_right
- stswx r5,r_offset,r4 // store them to r4+32
- add r4,r4,r_right
- b @endright
-
- // --------------------------------- ALIGNED
-
- // copy 32 byte blocks (wo/ buffer)
- @rightAligned:
-
- beq cr6,@skiprightAligned // no 32-byte blocks in rect B
- mtctr r_rightblocks
- @rightloopAligned:
- lfdu fp1,32(r3)
- lfd fp2,8(r3)
- lfd fp3,16(r3)
- lfd fp4,24(r3)
- stfdu fp1,32(r4)
- stfd fp2,8(r4)
- stfd fp3,16(r4)
- stfd fp4,24(r4)
- bdnz @rightloopAligned
- @skiprightAligned:
-
- // copy left-over bytes (<32)
- bc IF_NOT,7*4+0, @skipBDoubles // cr7_LT = at least one whole double left
- mtctr r_rightdoubles
- @rightloopDouble:
- lfd fp0,32(r3)
- addi r3,r3,8
- stfd fp0,32(r4)
- addi r4,r4,8
- bdnz @rightloopDouble
- @skipBDoubles:
- bc IF_NOT,7*4+1, @skipBLong // cr7_GT = right & 4
- lwz r5,32(r3)
- addi r3,r3,4
- stw r5,32(r4)
- addi r4,r4,4
- @skipBLong:
- bc IF_NOT,7*4+2, @skipBWord // cr7_EQ = right & 2
- lhz r5,32(r3)
- addi r3,r3,2
- sth r5,32(r4)
- addi r4,r4,2
- @skipBWord:
- bc IF_NOT,7*4+3, @skipBByte // cr7_SO = right & 1
- lbz r5,32(r3)
- addi r3,r3,1
- stb r5,32(r4)
- addi r4,r4,1
- @skipBByte:
-
- @endright:
-
- // ———————————————————————————————————————————————————————————————————————————
-
- subic. r_y,r_y,1 // one row done; the record form sets cr0 for the bne below
-
- add r3,r3,r_srcOffsetBtoA // step from the end of rect B to the next row's rect A
- add r4,r4,r_dstOffsetBtoA
-
- bne @rowloop
-
- @end:
- mtcrf 0xFF,r0 // restore the CR (mtcr r0)
- @gohome:
- lmw r19,-kRegisterSaveStack(SP) // restore registers from stack
-
- ASM_END
- }
-
- #pragma mark *** 680X0 asm:
- #elif USE_68K_ASSEMBLY
-
- // NOTE: Assumes rows > 0
-
- // NOTE: Aligning code assumes that rowBytes is multiple of 4
-
- // 680X0 version of BlitPixieDoubleRects.
- //
- // For each of "rows" rows: copies bytesA bytes (rect A) from src to dst,
- // advances src/dst by info->srcOffsetAtoB / info->dstOffsetAtoB, copies
- // bytesB bytes (rect B), then advances src/dst by info->srcOffsetBtoA /
- // info->dstOffsetBtoA to reach rect A of the next row.
- //
- // Each rect is copied in three steps: 0-3 bytes that bring dst up to a
- // 4-byte boundary, then longwords (a 16x unrolled loop entered part-way,
- // Duff's Device style), then 0-3 left-over bytes.
- //
- // The alignment amounts are computed once, from the first row only; this
- // relies on rowBytes being a multiple of 4. rows must be > 0, because the
- // row loop is bottom-tested.
- ASM_FUNC void BlitPixieDoubleRects(
- unsigned char *src,
- unsigned char *dst,
- unsigned short bytesA,
- unsigned short bytesB,
- unsigned short rows,
- BlitPixieOffsetInfoPtr info)
- {
- // VARIABLE(S) REGISTER
- // *temp*, block counter D0
- // *temp*, y (rows remaining) D1
- // bytesA, left (tail bytes) D2
- // leftblocks (64-byte blocks) D3
- // bytesB, right (tail bytes) D4
- // rightblocks (64-byte blocks) D5
- // alignA (low word only) D6
- // alignB (low word only) D7
- // src A0
- // dst A1
- // startA (entry into leftloop) A2
- // startB (entry into rightloop) A3
- // offsetinfoptr A4
-
- ASM_BEGIN
-
- MOVEM.L D3-D7/A2-A4,-(SP)
-
- MOVEA.L src,A0
- MOVEA.L dst,A1
- CLR.L D2 // zero-extend the 16-bit byte counts to 32 bits
- MOVE.W bytesA,D2
- CLR.L D4
- MOVE.W bytesB,D4
- MOVEA.L info,A4
-
- // srcOffsetAtoB = info->srcOffsetAtoB;
- // srcOffsetBtoA = info->srcOffsetBtoA;
- // dstOffsetAtoB = info->dstOffsetAtoB;
- // dstOffsetBtoA = info->dstOffsetBtoA;
- #define D_srcOffsetAtoB (A4)
- #define D_srcOffsetBtoA 4(A4)
- #define D_dstOffsetAtoB 8(A4)
- #define D_dstOffsetBtoA 12(A4)
-
- // alignment offset for rect A: alignA = min((-dst) & 3, bytesA)
- MOVE.L A1,D6
- MOVEQ #3,D1
- NEG.L D6
- AND.W D1,D6 // only the low word of D6 is meaningful from here on
- CMP.W D2,D6
- BLS.S @alignAok // unsigned compare: bytesA is an unsigned short
- MOVE.W D2,D6
- @alignAok:
-
- // alignment offset for rect B: alignB = min((-(dst + bytesA + dstOffsetAtoB)) & 3, bytesB)
- // NOTE: D2 must still hold the complete bytesA here (alignment bytes included),
- // because rect B starts bytesA + dstOffsetAtoB bytes after the start of rect A.
- MOVE.L A1,D7
- ADD.L D2,D7
- ADD.L D_dstOffsetAtoB,D7
- NEG.L D7
- AND.W D1,D7 // only the low word of D7 is meaningful from here on
- CMP.W D4,D7
- BLS.S @alignBok // unsigned compare: bytesB is an unsigned short
- MOVE.W D4,D7
- @alignBok:
-
- // the alignment bytes are copied separately, so take them out of the counts
- SUB.W D6,D2
- SUB.W D7,D4
-
- MOVEQ #15,D0 // mask for "longs modulo 16"
-
- //pre-calculate transfer sizes for rect A
- MOVE.W D2,D3
- LSR.W #6,D3 // leftblocks = bytesA / 64
- MOVE.W D2,D1
- ANDI.W #3,D2 // left = bytesA % 4 (tail bytes)
- LSR.W #2,D1 // / sizeof(long)
- AND.W D0,D1 // longs that do not fill a whole 64-byte block
- ADD.W D1,D1 // * sizeof(MOVE.L (A0)+,(A1)+)
- LEA @leftloopend,A2
- SUBA.L D1,A2 // A2 = entry point that executes exactly that many MOVE.Ls
-
- //pre-calculate transfer sizes for rect B
- MOVE.W D4,D5
- LSR.W #6,D5 // rightblocks = bytesB / 64
- MOVE.W D4,D1
- ANDI.W #3,D4 // right = bytesB % 4 (tail bytes)
- LSR.W #2,D1 // / sizeof(long)
- AND.W D0,D1 // longs that do not fill a whole 64-byte block
- ADD.W D1,D1 // * sizeof(MOVE.L (A0)+,(A1)+)
- LEA @rightloopend,A3
- SUBA.L D1,A3 // A3 = entry point that executes exactly that many MOVE.Ls
-
- MOVE.W rows,D1
-
- @rowloop:
-
- // ——————————————————————————————————————————————————————————————————————————— RECT A (left)
-
- // align destination to 4-byte boundary
- MOVE.W D6,D0
- ANDI.W #1,D0
- BEQ.S @skipalignAbyte
- MOVE.B (A0)+,(A1)+
- @skipalignAbyte:
- MOVE.W D6,D0
- ANDI.W #2,D0
- BEQ.S @skipalignAword
- MOVE.W (A0)+,(A1)+
- @skipalignAword:
-
- // copy 64 byte blocks (w/ Duff's Device)
- // The first pass starts at A2 and copies the partial block; DBRA then
- // runs the full 16-long body once per whole block counted in D0.
- MOVE.W D3,D0
- JMP (A2)
- @leftloop:
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
-
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
-
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
-
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- @leftloopend:
- DBRA D0,@leftloop
-
- // copy left-over bytes (D2 = 0-3)
- MOVE.W D2,D0
- BEQ.S @skipleftbyte // 0: nothing left
- SUBQ.W #2,D0
- BMI.S @leftbyte // 1: a single byte
- MOVE.W (A0)+,(A1)+ // 2 or 3: a word ...
- TST D0
- BEQ.S @skipleftbyte // 2: done
- @leftbyte:
- MOVE.B (A0)+,(A1)+ // 1 or 3: ... and/or a byte
- @skipleftbyte:
-
- ADDA.L D_srcOffsetAtoB,A0 // step from the end of rect A to the start of rect B
- ADDA.L D_dstOffsetAtoB,A1
-
- // ——————————————————————————————————————————————————————————————————————————— RECT B (right)
-
- // align destination to 4-byte boundary
- MOVE.W D7,D0
- ANDI.W #1,D0
- BEQ.S @skipalignBbyte
- MOVE.B (A0)+,(A1)+
- @skipalignBbyte:
- MOVE.W D7,D0
- ANDI.W #2,D0
- BEQ.S @skipalignBword
- MOVE.W (A0)+,(A1)+
- @skipalignBword:
-
- // copy 64 byte blocks (w/ Duff's Device), same scheme as for rect A
- MOVE.W D5,D0
- JMP (A3)
- @rightloop:
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
-
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
-
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
-
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- @rightloopend:
- DBRA D0,@rightloop
-
- // copy left-over bytes (D4 = 0-3)
- MOVE.W D4,D0
- BEQ.S @skiprightbyte // 0: nothing left
- SUBQ.W #2,D0
- BMI.S @rightbyte // 1: a single byte
- MOVE.W (A0)+,(A1)+ // 2 or 3: a word ...
- TST D0
- BEQ.S @skiprightbyte // 2: done
- @rightbyte:
- MOVE.B (A0)+,(A1)+ // 1 or 3: ... and/or a byte
- @skiprightbyte:
-
- ADDA.L D_srcOffsetBtoA,A0 // step from the end of rect B to the next row's rect A
- ADDA.L D_dstOffsetBtoA,A1
-
- // ———————————————————————————————————————————————————————————————————————————
-
- SUBQ.W #1,D1
- BNE @rowloop
-
- @end:
- MOVEM.L (SP)+,D3-D7/A2-A4
-
- ASM_END
- }
-
- #pragma mark *** Generic C:
- #elif USE_GENERIC_C
-
- // Generic C version: for every row, copy rect A (bytesA bytes) and then
- // rect B (bytesB bytes) from src to dst. The four offsets in *info say how
- // far to move each pointer from the end of one rect to the start of the next.
- void BlitPixieDoubleRects(
-     unsigned char *src,
-     unsigned char *dst,
-     unsigned short bytesA,
-     unsigned short bytesB,
-     unsigned short rows,
-     BlitPixieOffsetInfoPtr info)
- {
-     unsigned char *from = src;      // read cursor
-     unsigned char *to = dst;        // write cursor
-     unsigned short rowsLeft = rows;
-
-     BLITPIXIE_ASSERT( rows > 0 );
-     BLITPIXIE_ASSERT( bytesA > 0 );
-     BLITPIXIE_ASSERT( bytesB > 0 );
-
-     for ( ; rowsLeft != 0; rowsLeft-- )
-     {
-         // rect A, then hop over to rect B
-         BlitPixieMemCopy( to, from, bytesA );
-         from = from + bytesA + info->srcOffsetAtoB;
-         to = to + bytesA + info->dstOffsetAtoB;
-
-         // rect B, then hop to rect A of the following row
-         BlitPixieMemCopy( to, from, bytesB );
-         from = from + bytesB + info->srcOffsetBtoA;
-         to = to + bytesB + info->dstOffsetBtoA;
-     }
- }
-
- #endif